Detected ASvs
df <- as.data.frame(sample_sums(phy))
colnames(df) <- c("Number_of_ASVs")
df %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position",
"repeat_header", position = "")) %>% scroll_box(width = "100%", height = "200px")
|
|
Number_of_ASVs
|
|
S1
|
61063
|
|
S100
|
44511
|
|
S101
|
96603
|
|
S102
|
82465
|
|
S103
|
187655
|
|
S104
|
255049
|
|
S105
|
117233
|
|
S106
|
292794
|
|
S107
|
101782
|
|
S108
|
207574
|
|
S109
|
207872
|
|
S11
|
73001
|
|
S111
|
227189
|
|
S112
|
33625
|
|
S114
|
203251
|
|
S115
|
52786
|
|
S116
|
101199
|
|
S117
|
180747
|
|
S118
|
11031
|
|
S119
|
155800
|
|
S120
|
1044629
|
|
S121
|
210655
|
|
S122
|
286507
|
|
S123
|
140107
|
|
S124
|
80418
|
|
S126
|
253049
|
|
S127
|
387407
|
|
S128
|
171355
|
|
S129
|
64913
|
|
S130
|
279534
|
|
S131
|
165378
|
|
S132
|
68077
|
|
S134
|
80024
|
|
S135
|
189047
|
|
S137
|
34106
|
|
S138
|
67806
|
|
S139
|
227654
|
|
S14
|
164310
|
|
S140
|
20488
|
|
S141
|
81609
|
|
S142
|
27438
|
|
S143
|
191231
|
|
S145
|
55640
|
|
S15
|
123342
|
|
S16
|
27934
|
|
S17
|
152074
|
|
S18
|
36082
|
|
S2
|
128524
|
|
S21
|
9660
|
|
S24
|
68538
|
|
S25
|
89019
|
|
S26
|
92954
|
|
S28
|
39466
|
|
S29
|
60813
|
|
S3
|
15803
|
|
S31
|
67170
|
|
S33
|
21611
|
|
S34
|
127017
|
|
S35
|
92412
|
|
S37
|
101523
|
|
S38
|
88239
|
|
S39
|
71791
|
|
S4
|
124413
|
|
S40
|
53615
|
|
S42
|
107510
|
|
S43
|
20449
|
|
S44
|
55475
|
|
S46
|
60012
|
|
S47
|
116267
|
|
S48
|
4169
|
|
S50
|
76754
|
|
S51
|
67641
|
|
S52
|
9923
|
|
S53
|
117037
|
|
S54
|
120593
|
|
S55
|
116434
|
|
S56
|
85267
|
|
S57
|
20962
|
|
S58
|
72234
|
|
S59
|
130361
|
|
S60
|
80710
|
|
S61
|
116822
|
|
S62
|
117170
|
|
S63
|
74284
|
|
S64
|
103070
|
|
S65
|
167208
|
|
S66
|
41735
|
|
S67
|
201104
|
|
S68
|
35786
|
|
S69
|
92742
|
|
S71
|
94256
|
|
S72
|
88623
|
|
S74
|
92403
|
|
S75
|
140253
|
|
S77
|
76904
|
|
S78
|
140787
|
|
S79
|
80142
|
|
S8
|
24153
|
|
S80
|
23839
|
|
S81
|
20228
|
|
S82
|
177630
|
|
S83
|
148485
|
|
S84
|
227887
|
|
S85
|
181698
|
|
S86
|
38291
|
|
S88
|
22225
|
|
S89
|
111213
|
|
S9
|
62775
|
|
S90
|
16359
|
|
S92
|
46453
|
|
S93
|
173129
|
|
S94
|
37394
|
|
S95
|
4068
|
|
S96
|
110212
|
|
S97
|
261993
|
|
S98
|
287978
|
phy <- subset_samples(phy, !is.na(Inflammation))
phy <- subset_samples(phy, sample_sums(phy) > 5000)
phy <- subset_samples(phy, sample != "S120")
phy
phyloseq-class experiment-level object
otu_table() OTU Table: [ 3222 taxa and 111 samples ]
sample_data() Sample Data: [ 111 samples by 5 sample variables ]
tax_table() Taxonomy Table: [ 3222 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 3222 tips and 3218 internal nodes ]
total = median(sample_sums(phy))
standf = function(x, t = total) round(t * (x/sum(x)))
M.std = transform_sample_counts(phy, standf)
M.std = filter_taxa(M.std, function(x) sum(x > 10) > (0.01 * length(x)) | sum(x) >
0.001 * total, TRUE)
# M.f = filter_taxa(M.std,function(x) sum(x) > 0.001*total, TRUE)
ntaxa(M.std)
[1] 1075
require(cluster)
library(factoextra)
df1 <- data.frame(otu_table(M.std))
df_viz <- fviz_nbclust(df1, kmeans, method = "wss") + theme_minimal()
res_fanny <- fanny(t(df1), k = 3, metric = "SqEuclidean")
sample_data(M.std)$cluster <- as.character(res_fanny$clustering)
# clusters<-NULL memb_prob<-res_fanny$membership
# sample_ids<-rownames(memb_prob) for (i in 1:dim(memb_prob)[1]) {
# memb_prob_<-max(memb_prob[i,]) tmp<-data.frame(sample_ids[i], memb_prob_)
# clusters<-rbind(clusters, tmp) }
require(microbiomeSeq)
require(pheatmap)
phyTop <- taxa_level(M.std, "G_species")
TopASVs <- names(sort(taxa_sums(phyTop), TRUE))[1:50]
df1 <- as.data.frame(otu_table(t(phyTop)))
df1 <- df1[which(rownames(df1) %in% TopASVs), ]
df1 <- df1[which(rownames(df1) != " "), ]
df <- data.frame((sample_data(phyTop))[, c("Inflammation", "BV")])
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F,
cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

require(plotly)
p <- plot_richness(M.std, x = "cluster", color = "cluster", measures = c("Observed",
"Simpson"), title = paste0("Standardized to total reads"))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10,
face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
ggplotly(p)
p <- plot_richness(M.std, x = "Inflammation", color = "Inflammation", measures = c("Observed",
"Simpson"), title = paste0("Standardized to total reads"))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10,
face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
ggplotly(p)
p <- plot_richness(M.std, x = "BV", color = "BV", measures = c("Observed", "Simpson"),
title = paste0("Standardized to total reads"))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10,
face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
ggplotly(p)
est <- estimate_richness(M.std, split = TRUE, measures = c("Simpson"))
BV_div <- cbind(est, sample_data(M.std)[, "BV"])
t <- wilcox.test(BV_div$Simpson ~ BV_div$BV)
t
Wilcoxon rank sum test with continuity correction
data: BV_div$Simpson by BV_div$BV
W = 398, p-value = 1.71e-11
alternative hypothesis: true location shift is not equal to 0
BInflammation_div <- cbind(est, sample_data(M.std)[, "Inflammation"])
t <- wilcox.test(BInflammation_div$Simpson ~ BInflammation_div$Inflammation)
t
Wilcoxon rank sum test with continuity correction
data: BInflammation_div$Simpson by BInflammation_div$Inflammation
W = 2204, p-value = 3.854e-05
alternative hypothesis: true location shift is not equal to 0
Cluster_div <- cbind(est, sample_data(M.std)[, "cluster"])
dunn.test::dunn.test(Cluster_div$Simpson, Cluster_div$cluster)
Kruskal-Wallis rank sum test
data: x and group
Kruskal-Wallis chi-squared = 58.3445, df = 2, p-value = 0
Comparison of x by group
(No adjustment)
Col Mean-|
Row Mean | 1 2
---------+----------------------
2 | -5.906293
| 0.0000*
|
3 | -0.080297 6.501771
| 0.4680 0.0000*
alpha = 0.05
Reject Ho if p <= alpha/2
# ordination based on NMDS and bray-curtis distance
NMDS_bray <- ordinate(M.std, "NMDS")
Square root transformation
Wisconsin double standardization
Run 0 stress 0.2055082
Run 1 stress 0.2309574
Run 2 stress 0.217807
Run 3 stress 0.2221731
Run 4 stress 0.1968815
... New best solution
... Procrustes: rmse 0.0485541 max resid 0.2755098
Run 5 stress 0.222773
Run 6 stress 0.2169525
Run 7 stress 0.2262007
Run 8 stress 0.226073
Run 9 stress 0.2171289
Run 10 stress 0.2318284
Run 11 stress 0.2278323
Run 12 stress 0.2134497
Run 13 stress 0.2192831
Run 14 stress 0.2118285
Run 15 stress 0.2277937
Run 16 stress 0.2111281
Run 17 stress 0.2105766
Run 18 stress 0.4130426
Run 19 stress 0.2136909
Run 20 stress 0.2176685
*** No convergence -- monoMDS stopping criteria:
20: stress ratio > sratmax
title = c("NMDS of 16S microbiome, Bray-Curtis distance")
NMDS.bray.plot <- plot_ordination(M.std, NMDS_bray, color = "BV", shape = "Inflammation",
title = title)
NMDS.bray.plot <- NMDS.bray.plot + theme(axis.text = element_text(size = 16,
face = "bold"), axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) +
theme_bw() + labs(color = "BV") + geom_point(size = 5)
NMDS.bray.plot

PCoA.ord.bray <- ordinate(M.std, "PCoA", "bray")
title = c("PCoA of 16S microbiome, Bray-Curtis distance")
PCoA.ord <- plot_ordination(M.std, PCoA.ord.bray, color = "cluster", shape = "BV",
title = title)
PCoA.ord <- PCoA.ord + theme(axis.text = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) +
theme_bw() + labs(color = "cluster") + geom_point(size = 5)
PCoA.ord

# permanova
require(vegan)
phy_bray <- phyloseq::distance(M.std, method = "bray")
sampledf <- data.frame(sample_data(M.std))
adonis(phy_bray ~ BV, data = sampledf)
Call:
adonis(formula = phy_bray ~ BV, data = sampledf)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
BV 1 7.396 7.3956 27.198 0.19969 0.001 ***
Residuals 109 29.639 0.2719 0.80031
Total 110 37.035 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.00977 0.0097732 0.5461 999 0.469
Residuals 109 1.95083 0.0178975
adonis(phy_bray ~ Inflammation, data = sampledf)
Call:
adonis(formula = phy_bray ~ Inflammation, data = sampledf)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
Inflammation 1 3.676 3.6759 12.011 0.09926 0.001 ***
Residuals 109 33.359 0.3060 0.90074
Total 110 37.035 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$Inflammation)
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.00025 0.0002487 0.0177 999 0.904
Residuals 109 1.53154 0.0140509
adonis(phy_bray ~ cluster, data = sampledf)
Call:
adonis(formula = phy_bray ~ cluster, data = sampledf)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
cluster 2 17.856 8.9278 50.273 0.48213 0.001 ***
Residuals 108 19.179 0.1776 0.51787
Total 110 37.035 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.00977 0.0097732 0.5461 999 0.448
Residuals 109 1.95083 0.0178975
sampledf$cluster <- as.factor(sampledf$cluster)
logit_model <- glm(Inflammation ~ cluster + BMI + Age, family = binomial(link = "logit"),
data = sampledf)
summary(logit_model)
Call:
glm(formula = Inflammation ~ cluster + BMI + Age, family = binomial(link = "logit"),
data = sampledf)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8371 -0.7213 -0.6278 0.8985 1.9101
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.24683 3.05545 0.408 0.683
cluster2 -2.59960 0.62924 -4.131 3.61e-05 ***
cluster3 -0.71255 0.65552 -1.087 0.277
BMI 0.03651 0.05259 0.694 0.488
Age -0.04953 0.14773 -0.335 0.737
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 137.19 on 99 degrees of freedom
Residual deviance: 111.28 on 95 degrees of freedom
(11 observations deleted due to missingness)
AIC: 121.28
Number of Fisher Scoring iterations: 4
exp(logit_model$coefficients)
(Intercept) cluster2 cluster3 BMI Age
3.47929507 0.07430301 0.49039354 1.03718010 0.95167251
exp(confint(logit_model))
2.5 % 97.5 %
(Intercept) 0.007890584 1405.7584524
cluster2 0.019615802 0.2383974
cluster3 0.127073175 1.7228302
BMI 0.935752657 1.1525993
Age 0.710311734 1.2750525
Wald test:
----------
Chi-squared test:
X2 = 17.3, df = 2, P(> X2) = 0.00018
# Differential abundance using DESEQ2
require(DESeq2)
M.f.des <- taxa_level(M.std, "G_species")
dds <- phyloseq_to_deseq2(M.f.des, ~Inflammation)
# geometric mean, set to zero when all coordinates are zero
geo_mean_protected <- function(x) {
if (all(x == 0)) {
return(0)
}
exp(mean(log(x[x != 0])))
}
geoMeans <- apply(counts(dds), 1, geo_mean_protected)
dds <- estimateSizeFactors(dds, geoMeans = geoMeans)
dds <- DESeq(dds, fitType = "local")
res = results(dds, cooksCutoff = FALSE)
df <- as.data.frame(colData(dds)[, "BV"])
colnames(df) <- "BV"
alpha = 0.01
sigtab = as.data.frame(res[which(res$padj < alpha & abs(res$log2FoldChange) >
1.5), ])
# sigtab = cbind(as(sigtab, 'data.frame'),
# as(tax_table(M.f.des)[rownames(sigtab), ], 'matrix'))
dim(sigtab)
[1] 15 6
table_sig <- sigtab[c("log2FoldChange", "padj")]
table_sig$Abundant_Group <- levels(df$BV)[as.numeric(sigtab$log2FoldChange >
0) + 1]
table_sig %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position",
"repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
|
|
log2FoldChange
|
padj
|
Abundant_Group
|
|
Sneathia
|
-3.546803
|
0.0078426
|
Negative
|
|
Sneathia Sneathia amnii
|
-2.258075
|
0.0016651
|
Negative
|
|
Streptococcus Streptococcus anginosus subsp. anginosus
|
3.480048
|
0.0078426
|
Positive
|
|
Gemella Gemella asaccharolytica
|
-2.803589
|
0.0016651
|
Negative
|
|
Lactobacillus
|
2.150335
|
0.0056942
|
Positive
|
|
Anaerococcus uncultured Anaerococcus sp.
|
2.084764
|
0.0078426
|
Positive
|
|
Moryella Moryella sp. KHD1
|
-3.394192
|
0.0090199
|
Negative
|
|
Fastidiosipila Clostridiales bacterium oral clone MCE3_9
|
1.795469
|
0.0036219
|
Positive
|
|
Fastidiosipila unidentified
|
-3.837239
|
0.0008019
|
Negative
|
|
Shuttleworthia uncultured bacterium
|
-5.976390
|
0.0000000
|
Negative
|
|
Megasphaera
|
-3.245417
|
0.0000000
|
Negative
|
|
Sutterella Sutterella sp. Marseille-P3161
|
-5.566145
|
0.0056942
|
Negative
|
|
Porphyromonas Bacteroidales bacterium KA00251
|
-4.642502
|
0.0001815
|
Negative
|
|
Prevotella Prevotella sp. DNF00663
|
-3.944970
|
0.0066978
|
Negative
|
|
Prevotella
|
-4.997841
|
0.0000000
|
Negative
|
library("pheatmap")
select <- order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE)[1:50]
df <- as.data.frame(colData(dds)[, c("Inflammation", "BV", "cluster")])
df1 <- data.frame(assay(dds))[select, ]
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F,
cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

require(randomForest)
data.rf <- data.frame(otu_table(M.f.des))
data.rf$BV <- sample_data(M.std)$BV
BV.rf <- randomForest(BV ~ ., data = data.rf, importance = TRUE, proximity = TRUE)
df_accuracy <- as.data.frame(BV.rf$importance)
df_accuracy$Taxa <- rownames(df_accuracy)
df_accuracy <- df_accuracy[order(df_accuracy$MeanDecreaseAccuracy, decreasing = TRUE),
][1:20, ]
df_accuracy$Taxa <- factor(df_accuracy$Taxa, levels = df_accuracy$Taxa)
mda_plot <- ggplot(data = df_accuracy, aes(x = Taxa, y = MeanDecreaseAccuracy)) +
theme_bw()
mda_plot <- mda_plot + geom_bar(stat = "identity", fill = "darkblue", width = 0.5)
mda_plot <- mda_plot + theme(axis.text.x = element_text(angle = 90, hjust = 1,
vjust = 0.5))
mda_plot <- mda_plot + xlab("") + ylab("Mean Decrease in Accuracy") + coord_flip() +
theme(axis.text = element_text(size = 10, face = "bold"), axis.title = element_text(size = 16,
face = "bold"))
mda_plot

data.rf_reg <- data.frame(otu_table(M.f.des))
data.rf$BMI <- sample_data(M.f.des)$BMI
BMI.reg <- randomForest(BMI ~ ., data = data.rf, importance = TRUE, proximity = TRUE,
na.action = na.omit)
# BMI.reg$importance View(BMI.reg$importance)
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] randomForest_4.6-14 DESeq2_1.16.1
[3] SummarizedExperiment_1.6.5 DelayedArray_0.2.7
[5] matrixStats_0.54.0 Biobase_2.36.2
[7] GenomicRanges_1.28.6 GenomeInfoDb_1.12.3
[9] IRanges_2.10.5 S4Vectors_0.14.7
[11] BiocGenerics_0.22.1 aod_1.3.1
[13] vegan_2.5-4 lattice_0.20-38
[15] permute_0.9-5 pheatmap_1.0.12
[17] microbiomeSeq_0.1 factoextra_1.0.5
[19] cluster_2.0.7-1 plotly_4.9.0
[21] ggplot2_3.2.0 phyloseq_1.25.2
[23] kableExtra_1.1.0 rmdformats_0.3.5
[25] knitr_1.23
loaded via a namespace (and not attached):
[1] backports_1.1.4 uuid_0.1-2 Hmisc_4.2-0
[4] plyr_1.8.4 igraph_1.2.4 lazyeval_0.2.2
[7] sp_1.3-1 splines_3.4.1 BiocParallel_1.10.1
[10] crosstalk_1.0.0 rncl_0.8.3 digest_0.6.19
[13] foreach_1.4.4 htmltools_0.3.6 gdata_2.18.0
[16] memoise_1.1.0 checkmate_1.9.3 magrittr_1.5
[19] annotate_1.54.0 Biostrings_2.44.2 readr_1.3.1
[22] gmodels_2.18.1 prettyunits_1.0.2 colorspace_1.4-1
[25] blob_1.1.1 rvest_0.3.4 ggrepel_0.8.1
[28] xfun_0.8 dplyr_0.8.0.1 crayon_1.3.4
[31] RCurl_1.95-4.12 jsonlite_1.6 genefilter_1.58.1
[34] phylobase_0.8.6 survival_2.44-1.1 iterators_1.0.10
[37] ape_5.3 glue_1.3.1 gtable_0.3.0
[40] zlibbioc_1.22.0 XVector_0.16.0 webshot_0.5.1
[43] seqinr_3.4-5 questionr_0.7.0 dunn.test_1.3.5
[46] adegraphics_1.0-15 scales_1.0.0 DBI_1.0.0
[49] miniUI_0.1.1.1 Rcpp_1.0.1 htmlTable_1.13.1
[52] viridisLite_0.3.0 xtable_1.8-4 progress_1.2.2
[55] spData_0.3.0 bit_1.1-14 foreign_0.8-71
[58] spdep_0.7-9 Formula_1.2-3 htmlwidgets_1.3
[61] httr_1.4.0 RColorBrewer_1.1-2 acepack_1.4.1
[64] pkgconfig_2.0.2 XML_3.98-1.20 nnet_7.3-12
[67] deldir_0.1-16 locfit_1.5-9.1 AnnotationDbi_1.38.2
[70] tidyselect_0.2.5 labeling_0.3 rlang_0.3.1
[73] reshape2_1.4.3 later_0.8.0 munsell_0.5.0
[ reached getOption("max.print") -- omitted 56 entries ]